# Load all 18 session files into a list.
# NOTE(review): hard-coded absolute path — consider making this configurable.
session_dir <- "/Users/zejianzhang/Downloads/sessions"
session <- list()
for (i in 1:18) {
  # file.path() + paste0() replaces paste(..., sep = '') string surgery.
  session[[i]] <- readRDS(file.path(session_dir, paste0("session", i, ".rds")))
  # print(session[[i]]$mouse_name)
  # print(session[[i]]$date_exp)
}
# 'n.session' holds the number of sessions that were loaded.
n.session <- length(session)
# Using the tibble() to create a tibble called meta which store the metadata information from each session
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Pre-allocate the metadata tibble: one row per session, filled with
# placeholder values that the fill loop below overwrites column by column.
meta <- tibble(
  mouse_name   = rep("name", n.session),
  date_exp     = rep("dt", n.session),
  n_brain_area = numeric(n.session),
  n_neurons    = numeric(n.session),
  n_trials     = numeric(n.session),
  success_rate = numeric(n.session)
)
# Populate one metadata row per session.
for (i in seq_len(n.session)) {
  tmp <- session[[i]]
  # Mouse identity and experiment date.
  meta[i, 1] <- tmp$mouse_name
  meta[i, 2] <- tmp$date_exp
  # Number of distinct brain areas recorded in this session.
  meta[i, 3] <- length(unique(tmp$brain_area))
  # Neuron count = number of rows of the spike matrix for the first trial.
  meta[i, 4] <- nrow(tmp$spks[[1]])
  # Trial count = length of the feedback vector (one entry per trial).
  meta[i, 5] <- length(tmp$feedback_type)
  # Success rate: feedback is -1/1, so mean(x + 1) / 2 maps it into [0, 1].
  meta[i, 6] <- mean(tmp$feedback_type + 1) / 2
}
library(knitr)
# Render the metadata as an HTML table via kable() (typo in the original
# comment: "kabel"), rounding numeric columns to 2 decimal places and
# applying Bootstrap striped-table styling.
kable(meta, format = "html", table.attr = "class = 'table table-striped'", digits = 2)
| mouse_name | date_exp | n_brain_area | n_neurons | n_trials | success_rate |
|---|---|---|---|---|---|
| Cori | 2016-12-14 | 8 | 734 | 114 | 0.61 |
| Cori | 2016-12-17 | 5 | 1070 | 251 | 0.63 |
| Cori | 2016-12-18 | 11 | 619 | 228 | 0.66 |
| Forssmann | 2017-11-01 | 11 | 1769 | 249 | 0.67 |
| Forssmann | 2017-11-02 | 10 | 1077 | 254 | 0.66 |
| Forssmann | 2017-11-04 | 5 | 1169 | 290 | 0.74 |
| Forssmann | 2017-11-05 | 8 | 584 | 252 | 0.67 |
| Hench | 2017-06-15 | 15 | 1157 | 250 | 0.64 |
| Hench | 2017-06-16 | 12 | 788 | 372 | 0.69 |
| Hench | 2017-06-17 | 13 | 1172 | 447 | 0.62 |
| Hench | 2017-06-18 | 6 | 857 | 342 | 0.80 |
| Lederberg | 2017-12-05 | 12 | 698 | 340 | 0.74 |
| Lederberg | 2017-12-06 | 15 | 983 | 300 | 0.80 |
| Lederberg | 2017-12-07 | 10 | 756 | 268 | 0.69 |
| Lederberg | 2017-12-08 | 8 | 743 | 404 | 0.76 |
| Lederberg | 2017-12-09 | 6 | 474 | 280 | 0.72 |
| Lederberg | 2017-12-10 | 6 | 565 | 224 | 0.83 |
| Lederberg | 2017-12-11 | 10 | 1090 | 216 | 0.81 |
# Examine one example in depth: session 2, trial 1.
i.s <- 2  # session index
i.t <- 1  # trial index
# Spike matrix (neurons x time bins) for that session/trial.
spk.trial <- session[[i.s]]$spks[[i.t]]
# Brain-area label for every neuron in the session.
area <- session[[i.s]]$brain_area
# Total spike count per neuron: sum across each row (time bins).
spk.count <- apply(spk.trial, 1, sum)
# Base-R route: mean spike count per brain area via tapply().
spk.average.tapply <- tapply(spk.count, area, mean)
# dplyr route: same computation via a grouped summary on a data frame
# pairing each neuron's area label with its spike count.
tmp <- data.frame(
  area = area,
  spikes = spk.count
)
spk.average.dplyr <- tmp %>%
  group_by(area) %>%
  summarize(mean = mean(spikes))
# i.t and this_session are two parameters we considered.
average_spike_area <- function(i.t, this_session){
  # Average total spike count per brain area for one trial.
  #
  # i.t:          trial index into this_session$spks.
  # this_session: session list with $spks (list of neuron x time matrices)
  #               and $brain_area (one area label per neuron/row).
  # Returns a named vector: mean per-neuron spike total for each unique area.
  trial_spikes <- this_session$spks[[i.t]]
  per_neuron <- rowSums(trial_spikes)  # total spikes fired by each neuron
  tapply(per_neuron, this_session$brain_area, mean)
}
average_spike_area(1, this_session = session[[i.s]]) # calculate the average spike count for the first trail in this_session
## CA1 POST root VISl VISpm
## 1.121053 1.816754 1.538462 1.398268 2.000000
# Number of trials in session i.s (one feedback value per trial).
n.trial = length(session[[i.s]]$feedback_type)
# Number of distinct brain areas recorded in session i.s.
n.area = length(unique(session[[i.s]]$brain_area))
# One row per trial; columns are the per-area averages followed by
# feedback, left contrast, right contrast, and the trial id
# (hence n.area + 1 + 2 + 1 columns).
trial.summary = matrix(nrow = n.trial, ncol = n.area + 1 + 2 + 1)
for(i.t in 1:n.trial){
  trial.summary[i.t,] = c(
    average_spike_area(i.t, this_session = session[[i.s]]), # per-area means
    session[[i.s]]$feedback_type[i.t],  # feedback for this trial
    session[[i.s]]$contrast_left[i.t],  # left contrast for this trial
    # BUG FIX: the original indexed with i.s (the session index), so every
    # row got the same right-contrast value instead of the trial's own.
    session[[i.s]]$contrast_right[i.t],
    i.t)                                # trial id
}
# Column names: area names from the last per-area average, then the
# feedback/contrast/id labels.
colnames(trial.summary) = c(names(average_spike_area(i.t,this_session = session[[i.s]])), 'feedback', 'left contr.','right contr.','id' )
# Matrix -> tibble for easier downstream manipulation.
trial.summary <- as_tibble(trial.summary)
# One translucent color per brain area (alpha = 0.7 sets transparency).
area.col = rainbow(n = n.area, alpha = 0.7)
# Empty canvas: trials on x, average spike counts on y.
# NOTE(review): ylim = c(0.5, 2.2) is hard-coded — areas whose average falls
# outside that band will be clipped; confirm the range against the data.
plot(x = 1,y = 0, col = 'white', xlim = c(0,n.trial), ylim = c(0.5,2.2), xlab = "Trials", ylab = "Average spike counts", main = paste("Spikes per area in Session", i.s))
# For each area: raw per-trial averages as a dashed line, plus a
# smoothing-spline trend as a thick solid line in the same color.
for(i in 1:n.area){
lines(y = trial.summary[[i]], x = trial.summary$id, col = area.col[i], lty = 2, lwd = 1)
lines(smooth.spline(trial.summary$id, trial.summary[[i]]), col = area.col[i], lwd = 3)
}
# Legend maps each area name to its color; solid line, 0.8x text size.
legend("topright",
legend = colnames(trial.summary)[1:n.area],
col = area.col,
lty = 1,
cex = 0.8
)
plot.trial <- function(i.t, area, area.col, this_session){
  # Spike raster for one trial: each neuron is a horizontal row, and a dot
  # is drawn at every time bin in which that neuron fired, colored by the
  # neuron's brain area.
  #
  # i.t:          trial index.
  # area:         vector of brain-area names (drives the legend and colors).
  # area.col:     colors matching 'area', element for element.
  # this_session: session list with $spks, $time, $brain_area, $feedback_type.
  trial_spikes <- this_session$spks[[i.t]]
  n_neurons <- nrow(trial_spikes)
  bin_times <- this_session$time[[i.t]]
  # Empty canvas: time on x, neuron index on y (y-axis ticks suppressed).
  plot(0, 0, xlim = range(bin_times), ylim = c(0, n_neurons + 1),
       col = 'white', xlab = 'Time (s)', yaxt = 'n', ylab = 'Neuron',
       main = paste('Trial ', i.t, 'feedback', this_session$feedback_type[i.t]),
       cex.lab = 1.5)
  for (neuron in seq_len(n_neurons)) {
    # Pick the color associated with this neuron's brain area.
    area_idx <- which(area == this_session$brain_area[neuron])
    neuron_col <- area.col[area_idx]
    # Time bins in which this neuron spiked (count > 0).
    spike_bins <- which(trial_spikes[neuron, ] > 0)
    if (length(spike_bins) > 0) {
      points(x = bin_times[spike_bins], y = rep(neuron, length(spike_bins)),
             pch = '.', cex = 2, col = neuron_col)
    }
  }
  # Legend: one solid dot per area, in the matching color.
  legend("topright",
         legend = area,
         col = area.col,
         pch = 16,
         cex = 0.8
  )
}
# Brain-area names = all trial.summary columns except the trailing four
# (feedback, left contr., right contr., id).
varname <- names(trial.summary)
area <- head(varname, length(varname) - 4)
# Raster for trial 1 of session i.s.
plot.trial(1, area, area.col, session[[i.s]])
# Recompute the area names, then draw trials 1 and 2 side by side.
varname <- names(trial.summary)
area <- head(varname, length(varname) - 4)
par(mfrow = c(1, 2))  # 1 row x 2 columns layout
plot.trial(1, area, area.col, session[[i.s]])
plot.trial(2, area, area.col, session[[i.s]])
average_spike_area <- function(i.t, this_session) {
  # (Re-definition; behaves identically to the earlier version.)
  # Mean total spike count per brain area for trial i.t of this_session.
  # Returns a named vector keyed by unique brain-area label.
  counts <- rowSums(this_session$spks[[i.t]])  # spikes per neuron
  tapply(counts, this_session$brain_area, mean)
}
plot_trial <- function(i.t, area, area.col, this_session) {
  # Spike raster for a single trial (underscore-named variant of plot.trial;
  # note the slightly different title format).
  #
  # i.t:          trial index.
  # area:         brain-area names for the legend.
  # area.col:     colors matching 'area'.
  # this_session: session list with $spks, $time, $brain_area, $feedback_type.
  sp <- this_session$spks[[i.t]]
  n_neuron <- nrow(sp)
  t_pts <- this_session$time[[i.t]]
  # Blank frame: time on x, neuron index on y, y ticks suppressed.
  plot(0, 0, xlim = c(min(t_pts), max(t_pts)), ylim = c(0, n_neuron + 1),
       col = 'white', xlab = 'Time (s)', yaxt = 'n', ylab = 'Neuron',
       main = paste('Trial', i.t, 'Feedback:', this_session$feedback_type[i.t]),
       cex.lab = 1.5)
  # One pass per neuron: plot a dot at every bin where it spiked.
  for (k in seq_len(n_neuron)) {
    a_idx <- which(area == this_session$brain_area[k])
    clr <- area.col[a_idx]
    hit <- which(sp[k, ] > 0)
    if (length(hit) > 0) {
      points(x = t_pts[hit], y = rep(k, length(hit)), pch = '.', cex = 2, col = clr)
    }
  }
  # Legend mapping each area to its color.
  legend("topright", legend = area, col = area.col, pch = 16, cex = 0.8)
}
n.trial <- length(session[[i.s]]$feedback_type)       # trials in session i.s
n.area <- length(unique(session[[i.s]]$brain_area))   # distinct brain areas
# One row per trial: per-area averages + feedback + contrasts + trial id.
trial.summary <- matrix(nrow = n.trial, ncol = n.area + 1 + 2 + 1)
for (i.t in 1:n.trial) {
  trial.summary[i.t, ] <- c(
    average_spike_area(i.t, this_session = session[[i.s]]), # per-area means
    session[[i.s]]$feedback_type[i.t],  # feedback for this trial
    session[[i.s]]$contrast_left[i.t],  # left contrast for this trial
    # BUG FIX: the original used [i.s], repeating one fixed right-contrast
    # value for every trial instead of the trial's own value.
    session[[i.s]]$contrast_right[i.t],
    i.t)                                # trial id
}
# Name the columns: per-area names first, then the fixed trailing labels.
colnames(trial.summary) <- c(names(average_spike_area(i.t, this_session = session[[i.s]])), 'feedback', 'left contr.', 'right contr.', 'id')
# Convert the matrix into a tibble for downstream use.
trial.summary <- as_tibble(trial.summary)
# One translucent color per brain area, shared by every raster below.
area.col <- rainbow(n = n.area, alpha = 0.7)
# Shrink the margins so the rasters use more of the device.
par(mar = c(2, 2, 2, 2))
# NOTE(review): n_row and n_col are computed but never used — par(mfrow = ...)
# is never called with them, so the rasters below are drawn one at a time,
# each replacing the previous figure. Confirm whether a grid was intended.
n_row <- ceiling(n.trial / 2)
n_col <- min(n.trial, 2)
# NOTE(review): plot.new() is redundant here — plot_trial() calls plot(),
# which starts a new frame itself.
plot.new()
# Draw a spike raster for every trial of session i.s.
for (i.t in 1:n.trial) {
area <- names(trial.summary)[1:(length(names(trial.summary)) - 4)] # area names = all columns except the trailing feedback/contrast/id four
plot_trial(i.t, area, area.col, session[[i.s]]) # raster for trial i.t
}
# Restore R's default plot margins.
par(mar = c(5, 4, 4, 2) + 0.1)
# Per-trial summary statistics for every session.
# NOTE(review): the original grew six vectors with c() inside a nested loop
# (quadratic copying) and hard-coded 18 sessions; this version builds whole
# columns at once and iterates with seq_along().
trial_summary_list <- list()
session_summary_list <- vector("list", length(session))
for (i in seq_along(session)) {
  s <- session[[i]]
  n_tr <- length(s$feedback_type)
  # Mean and SD over all entries of each trial's spike matrix.
  spks_means <- unname(vapply(s$spks, function(m) mean(c(m)), numeric(1)))
  spks_sds <- unname(vapply(s$spks, function(m) sd(c(m)), numeric(1)))
  # One row per trial of this session.
  session_summary <- data.frame(
    session_number = rep(i, n_tr),
    feedback_type = s$feedback_type,
    contrast_left = s$contrast_left,
    contrast_right = s$contrast_right,
    spks_mean = spks_means,
    spks_sd = spks_sds
  )
  session_summary_list[[i]] <- session_summary
  # NOTE(review): as in the original, this splices the data frame's COLUMNS
  # into the list (not whole data frames); trial_summary_list is never read
  # later in this file — likely dead code, kept for compatibility.
  trial_summary_list <- c(trial_summary_list, session_summary)
}
# Stack every per-session summary into one data frame.
session_all <- do.call(rbind, session_summary_list)
# PCA on the four numeric trial-summary variables.
PCA.data <- session_all[, c("contrast_left", "contrast_right", "spks_mean", "spks_sd")]
# Center and scale to unit variance.
PCA.data <- scale(PCA.data)
# NOTE(review): scale. = TRUE re-standardizes already-scaled data — a no-op
# here, kept so behavior matches the original exactly.
PCA.result <- prcomp(PCA.data, scale. = TRUE)
# Principal-component scores as a data frame, one row per trial.
PCA_data_frame <- as.data.frame(PCA.result$x)
# Attach the session number so points can be colored by session.
PCA_data_frame$session_number <- session_all$session_number
# Scatter plot of the first two components, colored by session.
ggplot(PCA_data_frame, aes(x = PC1, y = PC2, color = as.factor(session_number))) +
  geom_point() +
  labs(color = "Session Number") +
  theme_minimal() +
  ggtitle("PCA Plot")
# By observing the scatter plot, I find that the points in the middle are
# clustered, while the points on both sides are more dispersed. In my
# opinion, the sessions in the middle are similar.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(42)  # reproducible partition and resampling
# Per-trial feature table for every session.
session_data <- list()
# seq_along() instead of 1:length() (safe if session were empty); vapply()
# instead of sapply() for a type-stable numeric result.
for (i in seq_along(session)) {
  feedback_type <- session[[i]]$feedback_type      # -1/1 outcome per trial
  # Total spike count per trial: sum over all neurons and time bins.
  spk.counts <- vapply(session[[i]]$spks, function(x) sum(rowSums(x)), numeric(1))
  ContrastLeft <- session[[i]]$contrast_left       # left stimulus contrast
  ContrastRight <- session[[i]]$contrast_right     # right stimulus contrast
  session_data[[i]] <- data.frame(feedback_type, spk.counts, ContrastLeft, ContrastRight)
}
# Stack all sessions into a single modeling frame.
NewData <- do.call(rbind, session_data)
# caret classification requires a factor outcome.
NewData$feedback_type <- as.factor(NewData$feedback_type)
# Stratified 70/30 train/test split on the outcome.
Train <- createDataPartition(NewData$feedback_type, p = 0.7, list = FALSE)
train_data <- NewData[Train, ]
test_data <- NewData[-Train, ]
# Logistic regression (binomial GLM): feedback ~ spike count + contrasts.
model <- train(feedback_type ~ ., data = train_data, method = "glm", family = "binomial")
model  # inspect the fitted model
## Generalized Linear Model
##
## 3558 samples
## 3 predictor
## 2 classes: '-1', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3558, 3558, 3558, 3558, 3558, 3558, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7061096 0
# to make prediction on test_data using the train model.
prediction_session = predict(model, newdata = test_data)
# get the confusion matrix by comparing prediction_session and test_data$feedback_type
ConfusionMatrix = confusionMatrix(prediction_session, test_data$feedback_type)
ConfusionMatrix # to see our confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 0 0
## 1 441 1082
##
## Accuracy : 0.7104
## 95% CI : (0.6869, 0.7331)
## No Information Rate : 0.7104
## P-Value [Acc > NIR] : 0.5128
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7104
## Prevalence : 0.2896
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : -1
##
# Read the two held-out test sessions.
# NOTE(review): the loop below also overwrites session[[1]] and session[[2]]
# (the Cori/Lederberg training sessions loaded earlier) — confirm intended.
test_dir <- "/Users/zejianzhang/Downloads/test"
test1 <- readRDS(file.path(test_dir, "test1.rds"))
test2 <- readRDS(file.path(test_dir, "test2.rds"))
setwd(test_dir)  # kept for compatibility; file.path() below avoids relying on it
# NOTE(review): mouseanddate is created but never filled or read — dead code?
mouseanddate <- data.frame(mouse_name = character(), data_exp = character())
# Read both test files into the session list and echo their identifiers.
for (i in 1:2) {
  session[[i]] <- readRDS(file.path(test_dir, paste0("test", i, ".rds")))
  print(session[[i]]$mouse_name)
  print(session[[i]]$date_exp)
}
## [1] "Cori"
## [1] "2016-12-14"
## [1] "Lederberg"
## [1] "2017-12-11"
set.seed(42)  # reproducible partition and resampling
# Per-trial feature table for the two test sessions.
test <- list()
for (i in 1:2) {
  feedback_type <- session[[i]]$feedback_type     # -1/1 outcome per trial
  # Total spike count per trial (type-stable vapply instead of sapply).
  spk.counts <- vapply(session[[i]]$spks, function(x) sum(rowSums(x)), numeric(1))
  ContrastLeft <- session[[i]]$contrast_left      # left stimulus contrast
  ContrastRight <- session[[i]]$contrast_right    # right stimulus contrast
  test[[i]] <- data.frame(feedback_type, spk.counts, ContrastLeft, ContrastRight)
}
# Combine both sessions into one frame.
new_test <- do.call(rbind, test)
# Factor outcome for caret classification.
new_test$feedback_type <- as.factor(new_test$feedback_type)
# Stratified 70/30 split of the test-session trials.
test_train <- createDataPartition(new_test$feedback_type, p = 0.7, list = FALSE)
test_train_data <- new_test[test_train, ]
test_data <- new_test[-test_train, ]
# Logistic regression fitted on the TRAINING split.
# BUG FIX: the original passed data = test_data here, i.e. it trained on the
# same 30% hold-out it later predicts on, invalidating the evaluation.
test_model <- train(feedback_type ~ ., data = test_train_data, method = "glm", family = "binomial")
test_model  # inspect the fitted model
## Generalized Linear Model
##
## 59 samples
## 3 predictor
## 2 classes: '-1', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 59, 59, 59, 59, 59, 59, ...
## Resampling results:
##
## Accuracy Kappa
## 0.6871939 0.08260289
# prediction
test_pred = predict(test_model, newdata = test_data)
# confusion matrix
test_confusionmatrix = confusionMatrix(test_pred, test_data$feedback_type)
test_confusionmatrix # to see our confusion matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 3 1
## 1 13 42
##
## Accuracy : 0.7627
## 95% CI : (0.6341, 0.8638)
## No Information Rate : 0.7288
## P-Value [Acc > NIR] : 0.337114
##
## Kappa : 0.2148
##
## Mcnemar's Test P-Value : 0.003283
##
## Sensitivity : 0.18750
## Specificity : 0.97674
## Pos Pred Value : 0.75000
## Neg Pred Value : 0.76364
## Prevalence : 0.27119
## Detection Rate : 0.05085
## Detection Prevalence : 0.06780
## Balanced Accuracy : 0.58212
##
## 'Positive' Class : -1
##
Steinmetz, N.A., Zatka-Haas, P., Carandini, M. et al. Distributed coding of choice, action and engagement across the mouse brain. Nature 576, 266–273 (2019). https://doi.org/10.1038/s41586-019-1787-x